How I Have Used Data Science

5/13/25

Tommy Matheis

Where Have I Used Data Science?

I was able to apply skills learned in this class in two main places this semester:

- Introduction to Statistics, Math 58 with Professor Chandler
- The Student Life Newspaper, on the data team

What parts of Data Science did I use?

So far, the aspects of data science that I have found most useful include:

- Data visualization with ggplot
- Using Tidyverse verbs to organize and clean data
- Permutation/randomization tests to simulate a probability
- Web scraping

Analyzing Relative Age Effect for Canadian NHL Players

# Uses the provided dataset `Bdays.recent`.

# Scatter of NumDays (labelled "Day of the Year") against year. The dashed
# red reference line sits at 182.5, i.e. 365 / 2.
ggplot(data = Bdays.recent, mapping = aes(x = year, y = NumDays)) +
  geom_point(colour = "black") +
  geom_hline(yintercept = 182.5, linetype = "dashed", colour = "red") +
  ggtitle(
    label = "Birthdays of Canadian NHL Players with Last Name Beginning with \"L\"",
    subtitle = "n = 88"
  ) +
  xlab("Year") +
  ylab("Day of the Year") +
  theme_minimal(base_family = "Palatino")

Plotting Observed vs. Expected Counts

# Observed births per quarter versus the count expected if births were spread
# evenly across the quarters.
quarters <- c("Q1", "Q2", "Q3", "Q4")
observed_counts <- c(26, 36, 12, 14)

# Derive the uniform expectation from the data (88 / 4 = 22 here) instead of
# hard-coding the sample size, so it stays correct if the counts change.
expected_counts <- rep(
  sum(observed_counts) / length(quarters),
  times = length(quarters)
)

# Long format: one row per quarter/type pair, ready for a dodged bar chart.
data_plot <- data.frame(
  Quarter = rep(quarters, times = 2),
  Count = c(observed_counts, expected_counts),
  Type = rep(c("Observed", "Expected"), each = length(quarters))
)
# Plot the bar chart: observed and expected counts side by side per quarter.
ggplot(data_plot, aes(x = Quarter, y = Count, fill = Type)) +
  # geom_col() draws bar heights straight from the y values.
  geom_col(position = "dodge") +
  labs(
    title = "Observed vs. Expected Player Birth Distribution",
    x = "Birth Quarter",
    y = "Number of Players"
  ) +
  # Colours are keyed by the Type value itself rather than by level position,
  # so the legend cannot be mislabelled if the level order ever changes.
  scale_fill_manual(values = c(Expected = "orange", Observed = "skyblue")) +
  theme_minimal(base_family = "Palatino")

Analyzing Average Lag-Time Between Local and Non-Local Runners

# Uses the provided dataset `TMR.mini`.

# Box plot of lag, split and coloured by the DMV grouping variable.
# NOTE(review): fill colours and legend labels are matched to DMV's levels by
# position, so this assumes the first level is the non-local group — confirm
# against the data.
ggplot(data = TMR.mini, mapping = aes(x = DMV, y = lag, fill = DMV)) +
  geom_boxplot() +
  scale_fill_manual(
    labels = c("Non-local", "Local"),
    values = c("orange", "skyblue")
  ) +
  ggtitle(
    label = "Comparison of Lag Times by Runner Origin",
    subtitle = "n = 20 local, n = 10 non-local"
  ) +
  xlab("Local?") +
  ylab("Lag (seconds)") +
  theme_minimal(base_family = "Palatino")

Pomona-Pitzer Athletics Spending

# Sports left out of the men's/women's funding comparison.
# NOTE(review): presumably these are excluded because they do not field both
# a men's and a women's team — confirm.
excluded_sports <- c(
  "Baseball", "Football", "Softball", "Volleyball", "Lacrosse"
)

# Apply the same exclusion to both data sets. The `!is.na(sport)` condition
# keeps the behaviour of the original `sport != ...` comparisons, which also
# dropped rows whose sport was missing.
ppeada1 <- ppeada |>
  filter(!is.na(sport), !sport %in% excluded_sports)
ppeada2 <- ppeada23 |>
  filter(!is.na(sport), !sport %in% excluded_sports)

# Stacked bars rescaled to 100% per sport (position = "fill"), so each bar
# shows how that sport's expenses split between the "M" and "F" teams.
ggplot(ppeada2, aes(x = sport, y = expenses, fill = sex)) +
  geom_col(position = "fill") +
  scale_fill_manual(
    values = c(
      "M" = "#0057b8",
      "F" = "#f7941d"
    ),
    name = "Team"
  ) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    x = "", y = "Proportion of Funding",
    title = "Proportions of Funding Between Men's and Women's Teams by Sport",
    subtitle = "2022-2023"
  ) +
  theme_minimal(base_family = "Palatino")

Pomona-Pitzer Athletics Spending

# Funding split between the "M" and "F" teams for each sport in `ppeada1`.
ggplot(data = ppeada1, mapping = aes(x = sport, y = expenses, fill = sex)) +
  # position = "fill" rescales every sport's stacked bar to a total of 1, so
  # the segments read as proportions rather than raw expenses.
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_fill_manual(
    name = "Team",
    values = c("M" = "#0057b8", "F" = "#f7941d")
  ) +
  labs(
    title = "Proportions of Funding Between Men's and Women's Teams by Sport",
    subtitle = "2023-2024",
    x = "",
    y = "Proportion of Funding"
  ) +
  theme_minimal(base_family = "Palatino")

Fall 2025 Pre-Registration

# Uses data collected from Hyperschedule every 15 minutes during registration.

# Registration start markers: one dashed line per day at 8:00 AM Pacific,
# each paired with the label drawn next to it. Keeping them in parallel
# vectors replaces three copy-pasted geom_vline()/annotate() pairs.
registration_starts <- as.POSIXct(
  c("2025-04-22 08:00:00", "2025-04-23 08:00:00", "2025-04-24 08:00:00"),
  tz = "America/Los_Angeles"
)
registration_labels <- c(
  "Start of Senior \nRegistration",
  "Start of Sophomore \nRegistration",
  "Start of Freshmen \nRegistration"
)

ggplot(num_closed, aes(x = DateTime, y = closed)) +
  geom_line(color = "blue", linewidth = 1.7) +
  geom_vline(
    xintercept = registration_starts,
    color = "red",
    linetype = "dashed"
  ) +
  # Each label starts one hour (3600 s) to the right of its line, i.e. at
  # 9:00 AM, and is left-aligned so it does not sit on top of the line.
  annotate(
    "text",
    x = registration_starts + 60 * 60,
    y = 700,
    label = registration_labels,
    hjust = 0,
    size = 3.5,
    family = "Palatino"
  ) +
  scale_y_continuous(
    limits = c(0, 750),
    breaks = seq(0, 750, by = 100)
  ) +
  scale_x_datetime(
    # NOTE(review): `start` is not defined in this chunk. It needs to be a
    # date-time created earlier in the document; otherwise the name resolves
    # to the stats::start() function and seq() fails — confirm.
    breaks = seq(
      from = start,
      to = max(num_closed$DateTime), # or a hard date
      by = "8 hours"
    ),
    timezone = "America/Los_Angeles", # converts tick labels to Pacific time
    date_labels = "%b %d\n%I:%M %p" # e.g. "Apr 23\n01:00 PM"
  ) +
  labs(
    x = "",
    y = "Number of Courses",
    title = "Courses Closed During Fall 2025 Pre-Registration"
  ) +
  theme_minimal(base_family = "Palatino") +
  theme(plot.title = element_text(size = 18))

Current Project: Scraping Swimcloud.com

#' Scrape one page of the 2025 men's SwimCloud recruiting rankings
#'
#' Downloads the rankings page for the given page number and pulls three
#' fields out of the HTML by CSS selector.
#'
#' @param page Page number appended to the rankings URL's `page` query
#'   parameter.
#' @return A tibble with columns `ranking` and `power_index` (element text
#'   coerced with `as.numeric()`), `state` (the two capital letters at the end
#'   of the element text, `NA` when there are none), and `sex` (always `"M"`).
#'   The three selectors must match the same number of elements, otherwise
#'   `tibble()` errors on the mismatched column lengths.
swimmers_2025_M <- function(page) {
  url <- paste0(
    "https://www.swimcloud.com/recruiting/rankings/2025/M/1/?page=",
    page
  )
  # Keep the parsed document under its own name rather than overwriting the
  # `page` argument, so the page number stays available and readable.
  doc <- read_html(url)

  ranking <- doc |>
    html_elements(".u-pr0") |>
    html_text() |>
    as.numeric()

  power_index <- doc |>
    html_elements("td.u-text-end") |>
    html_text() |>
    as.numeric()

  state <- doc |>
    html_elements(".u-text-small") |>
    html_text() |>
    str_extract("[A-Z]{2}$")

  # Return the tibble as the last expression; ending on an assignment would
  # return it invisibly.
  tibble(
    ranking = ranking,
    power_index = power_index,
    state = state,
    sex = "M"
  )
}

pages <- 1:537

# Scrape every page and row-bind the per-page tibbles into one data frame.
top_swimmers_2025_M <- map_dfr(pages, swimmers_2025_M)

# Write the file as a separate step: write.csv() returns NULL, so piping into
# it on the assignment line would leave `top_swimmers_2025_M` set to NULL
# instead of the scraped data.
write.csv(top_swimmers_2025_M, "top_swimmers_2025_M.csv")

Thank you!

```